/*******************************************************************************

[Last updated: June 4th, 2024]

This script performs the following tasks:

	1. Runs ITT regressions across multiple outcome groups
	2. Applies FDR adjustments
	3. Conducts regressions for heterogeneous treatment effects
	4. Stores relevant estimates in Stata matrices.

*******************************************************************************/


* Load the data file -----------------------------------------------------------

	* Guard: unless the global individual_run equals "Yes", print a reminder and
	* leave this do-file before any data is loaded.
	if "$individual_run" != "Yes" {
		display "Program Stopped: Make sure you load the parameters in master.do"
		exit
	}

	* The path is quoted so that a location containing spaces is read as a
	* single filename rather than split into several tokens.
	use "${interim_data}", clear

	
	
* Sample restriction and final data cleaning -----------------------------------

	* Retain only observations whose month lies in 1 through 12
	drop if !inrange(month, 1, 12)

	* Shorten the outcome names. The original note gives the reason: Stata
	* restricts matrix names. The estimation calls further down pass these
	* short names on to the regression wrapper.
	* For variable descriptions, refer to script {admin_analysis_2}.
	*
	* Positional mapping (old name -> new name):
	*	l12_trr_C               -> NLU
	*	trr_nf_p2020_1_C        -> UL1
	*	trr_nf_p2020_2_C        -> UL2
	*	arrest_con_cop          -> DAR
	*	days_iod                -> OI
	*	l23_trr_C               -> LTU
	*	trr_C                   -> AU
	*	l12_trr_s_i_C           -> NLUSI
	*	l12_trr_sa_injury_C     -> NLUSA
	*	l12_trr_s_h_C           -> NLUH
	*	l12_trr_s_hos_i_ix_C    -> NLUSH
	*	ttt_score_cut_Am        -> TTT
	*	award                   -> AW
	*	total_w_exo_accusations -> COM
	*	prs_score_w_arrs_cut_Am -> AI
	*	i_complaint_pri_cut_Am  -> COMP
	*	i_complaint_sec_cut_Am  -> COMS
	*	i_complaints_cut_Am     -> COMI

	global oldnames l12_trr_C trr_nf_p2020_1_C trr_nf_p2020_2_C arrest_con_cop days_iod ///
					l23_trr_C trr_C l12_trr_s_i_C ///
					l12_trr_sa_injury_C l12_trr_s_h_C l12_trr_s_hos_i_ix_C ///
					ttt_score_cut_Am award total_w_exo_accusations prs_score_w_arrs_cut_Am ///
					i_complaint_pri_cut_Am i_complaint_sec_cut_Am i_complaints_cut_Am

	global newnames NLU UL1 UL2 DAR OI ///
					LTU AU NLUSI ///
					NLUSA NLUH NLUSH ///
					TTT AW COM AI ///
					COMP COMS COMI

	* Group rename: the k-th old name becomes the k-th new name
	rename (${oldnames}) (${newnames})

	
	
* Pre-defined sets of outcomes (these limit what is run for each month length) --

	* 4-month blocks: this specification carries the most outcomes
		global outcomes_4m				NLU UL1 UL2 DAR OI AI LTU AU ///
										NLUSI NLUSA NLUH NLUSH TTT ///
										AW COM COMI

	* 3-month blocks and the single 12-month block
		global outcomes_3m				NLU DAR OI AI
		global outcomes_12m				NLU DAR OI AI COM

	* Core outcomes only, used where an analysis is restricted to them
		global outcomes_4m_core			NLU DAR OI AI
		global outcomes_3m_core			NLU DAR OI AI



* Families for FDR adjustment --------------------------------------------------

	* Adverse outcomes
		global family_adverse			NLU DAR

	* Safety outcomes (one global per block length; the contents are identical)
		global family_safety_4			OI AI
		global family_safety_3			OI AI
		global family_safety_12			OI AI

	* Auxiliary outcomes
		global family_aux_b				NLUSI NLUSA NLUH NLUSH TTT

	* Downstream outcomes
		global family_downstream		AW COM COMI

	* Placeholder for outcomes not in any family: no q-value is reported
	* for these outcomes.
	* NOTE(review): COM is listed here and also in family_downstream, and the
	* q-value step processes both families for p_m4 -- confirm which family
	* COM is meant to belong to.
		global family_phantom			LTU AU UL1 UL2 COM



* Covariates list --------------------------------------------------------------

	* Month-period indicators
		global month_list_4m			months_1_4 months_5_8 months_9_12
		global month_list_3m			months_1_3 months_4_6 months_7_9 months_10_12
		global month_list_12m			months_1_12

	* Month-period x treatment indicators
		global month_treatment_list_4m	months_1_4_t months_5_8_t months_9_12_t
		global month_treatment_list_3m	months_1_3_t months_4_6_t ///
										months_7_9_t months_10_12_t
		global month_treatment_list_12m	months_1_12_t

	* Baseline covariates for the non-LASSO regressions
	global picked5a			pre_arrest_con_cop pre_prs_score_w_arrs pre_days_iod ///
							experience ///
							dblack dwhite dhispanic ///
							dmale

	* Candidate covariates offered to the LASSO
	global consideration_v	pre_arrest_con_cop pre_prs_score_w_arrs pre_days_iod ///
							experience ///
							dblack dhispanic dwhite ///
							dmale
				
				
					

		
* (1A) OLS regressions ---------------------------------------------------------

	/* Runs four specifications per outcome and per block length (4, 3 and 12
	   months) through the wrapper ALL_REG, which is not defined in this file
	   (see script {function} for details). ALL_REG is called with a model
	   type, a specification code and an outcome; the globals set below
	   (covariates, restriction, reporting_target, month_list,
	   month_treatment_list) are prepared before each call.

	   Main specification: [m4]. Regression behind Figure 2: [h4].

	   Notes on the specifications, as recorded by the authors:

		[m4][S4]  (model type ITT_multi)
			Outcome ~ treatment + covariates + i.strata + i.year_month
			SEs clustered at the officer level
				- The sample is restricted (months_1_4 == 1, etc.)
				- The covariate set is purely baseline covariates
				  (empty for the S code)
				- The coefficient on "treatment" is reported, one per
				  restriction period

		[h4][s4]  (model type ITT_single)
			Outcome ~ covariates + i.strata + i.year_month
			SEs clustered at the officer level
				- The sample is NOT restricted
				- The covariate set includes the month-period indicators,
				  the month x treatment indicators and, for the h code,
				  the baseline covariates
				- The coefficients on the interaction terms are reported,
				  one per indicator
	*/

	foreach len in 4 3 12 {

		* Period indicators and their treatment interactions for this block length
		global month_list				${month_list_`len'm}
		global month_treatment_list		${month_treatment_list_`len'm}

		* Sample restrictions used by the one-regression-per-period models
		global restriction				${month_list}

		* Terms whose estimates are reported
		global reporting_target			${month_treatment_list}

		foreach y in ${outcomes_`len'm} {

			* [m]: one regression per period, WITH baseline covariates
			global covariates ${picked5a}
			ALL_REG ITT_multi m`len' `y'

			* [S]: one regression per period, WITHOUT baseline covariates
			global covariates
			ALL_REG ITT_multi S`len' `y'

			* [h]: one pooled regression, WITH baseline covariates
			global covariates ${month_list} ${month_treatment_list} ${picked5a}
			ALL_REG ITT_single h`len' `y'

			* [s]: one pooled regression, WITHOUT baseline covariates
			global covariates ${month_list} ${month_treatment_list}
			ALL_REG ITT_single s`len' `y'
		}
	}
		   
		 
* (1A - auxiliary) Coefficient equality test -----------------------------------
* Run for a subset of outcomes on the single-regression form used in [Figure 2]

		* Same setup as the pooled 4-month specification with baseline covariates
		global month_list				${month_list_4m}
		global month_treatment_list		${month_treatment_list_4m}
		global covariates				${month_list} ${month_treatment_list} ${picked5a}
		global reporting_target			${month_treatment_list}

		foreach y in NLU DAR OI AI {

			* The Wald tests below use the estimation results that are active
			* after this call
			ALL_REG ITT_single test `y'

			* Joint equality of the three period-specific treatment coefficients
			quietly test months_1_4_t = months_5_8_t  = months_9_12_t
			local p_joint = r(p)

			* Pairwise equality tests
			quietly test months_1_4_t = months_5_8_t
			local p_a_b = r(p)

			quietly test months_1_4_t = months_9_12_t
			local p_a_c = r(p)

			quietly test months_5_8_t = months_9_12_t
			local p_b_c = r(p)

			* One labelled 1 x 4 row of p-values per outcome
			matrix test_`y' = (`p_joint', `p_a_b', `p_a_c', `p_b_c')
			matrix colnames test_`y' = All 1-4vs5-8 1-4vs9-12 5-8vs9-12
			matrix rownames test_`y' = `y'
			matrix list test_`y'
		}

		* Stack the four outcome rows into a single matrix
		matrix e_test = test_NLU \ test_DAR \ test_OI \ test_AI

		matrix list e_test
		
		
		
* (1B) LASSO regression for main outcomes ---------------------------------------

	/* Calls ALL_REG (defined outside this file) with the model types
	   LASSO_multi and LASSO_single for each core outcome.
	   According to the original note, this runs a double-regression LASSO on
	   the outcome, stores the regression results for each variable, and saves
	   them into matrices named like p_M4v_outcome. */

		foreach cset in v {				// version of the consideration set
		foreach len in 4 {				// block length; only the 4-month block is run here

			* Period indicators and interactions for the single-regression form
			global month_list				${month_list_`len'm}
			global month_treatment_list		${month_treatment_list_`len'm}

			* Candidate covariates for this consideration-set version
			global consideration			${consideration_`cset'}

			* Sample restrictions for the multiple-regression form
			global restriction				$month_list

			foreach y in ${outcomes_`len'm_core} {

				global reporting_target $month_treatment_list

				* Code [M4v]: LASSO counterpart of [m4]
				ALL_REG LASSO_multi 	M`len'`cset' 	`y'

				* Code [L4v]: LASSO counterpart of [h4]
				ALL_REG LASSO_single 	L`len'`cset' 	`y'
			}
		}
		}
		
			
		
		
* (2) Calculate q-values ------------------------------------------------------

		/*	For every specification, family and month period, this section
			builds the global "family" (a list of matrix names of the form
			version_outcome_months_period_t), displays it, and calls PQ.

			PQ is not defined in this file. According to the original notes:
				- it is stored in the "additional" script
				  (NOTE(review): the notes say that script is loaded at the
				  beginning of this file, but no such load is visible here);
				- it grabs the matrices already in Stata memory, collects the
				  corresponding p-values, passes them to Anderson's code, and
				  appends a q-value column to each matrix;
				- results are saved under names like q_p_M4v_outcome.

			The FDR adjustment is done across all outcomes within a family and
			within the same month period (e.g. all p-values gathered for
			family A, for months 1-3).
		*/

			foreach spec in p_m4 p_M4v p_S4 p_h4 p_m3 p_m12 {

				* Pick the families and periods relevant to this specification,
				* so that PQ is not run on every outcome and family.

				* Main model
				if "`spec'" == "p_m4" {
					global famlist 		adverse safety_4 aux_b downstream phantom
					global periodlist	1_4 5_8 9_12
				}
				* Robustness-check models on 4-month blocks
				else if inlist("`spec'", "p_M4v", "p_S4", "p_h4") {
					global famlist 		adverse safety_4
					global periodlist	1_4 5_8 9_12
				}
				* 3-month blocks
				else if "`spec'" == "p_m3" {
					global famlist 		adverse safety_3 
					global periodlist	1_3 4_6 7_9 10_12
				}
				* Single 12-month block
				else if "`spec'" == "p_m12" {
					global famlist 		adverse safety_12 
					global periodlist	1_12
				}

				foreach grp in ${famlist} {
					foreach win in ${periodlist} {

						* Rebuild the list of matrix names for this family and period
						global family
						foreach y in ${family_`grp'} {
							global family 	$family 	`spec'_`y'_months_`win'_t
						}

						* Show the list, then compute the q-values
						display "${family}"
						PQ
					}
				}
			}
		
* (3) Heterogeneous treatment effects ------------------------------------------

	* (4a) Initiate placeholders for the moderator and its treatment interaction.
	* capture lets the script continue if the variables already exist.
	foreach v in character treatment_character {
		capture generate `v' = .
	}

	* (4b) Clean up variable names: short aliases, since these names are
	* embedded in the matrix names built by the heterogeneity regressions

		* Share of Black and Hispanic residents at district level
		generate pBH 	= p_black_or_hispanic		// percent Black or Hispanic
		generate pB 	= p_black
		generate pH 	= p_hispanic

		* Threshold indicators: share of at least 0.5 (Black) or 0.25 (Hispanic).
		* inrange() returns 0 when its first argument is missing, so the
		* indicators are left missing where the share is missing; otherwise
		* those observations would be coded as 0.
		generate pmB 	= inrange(p_black, 0.5, 1)     if !missing(p_black)
		generate pmH	= inrange(p_hispanic, 0.25, 1) if !missing(p_hispanic)

		* Crime rate at district level
		generate crime	= all_crime_monthly_1K
		generate vcrime	= violent_crime_monthly_1K

		* Baseline characteristics
		generate bDAR	= pre_arrest_con_cop
		generate bNLU	= pre_l12_rc_trr_C
		generate bCOM	= pre_total_w_exo_accusations
		generate pArr 	= pre_arrest_con_cop + pre_arrest_non_con_cop

		* Arrests of Black and non-Black subjects at officer level
		generate aB		= arr_a_black
		generate aNB	= arr_a_nblack

	
	* (4c) Running the regressions
	*
	* For each outcome and each moderator, regress the outcome on treatment,
	* the moderator and their product, plus baseline covariates and strata and
	* year-month indicators, on the months 1-4 sample, with standard errors
	* clustered on employee_id. Each result row holds
	*	b, se, p, N for treatment, followed by b, se, p, N for the interaction.

		* Baseline covariates (the same variables as in picked5a). The list
		* does not change across iterations, so it is set once here.
		global covariates ///
								pre_arrest_con_cop ///
								pre_prs_score_w_arrs ///
								pre_days_iod ///
								experience dblack dwhite dhispanic dmale

		* Accumulator; its first row is an all-missing placeholder
		matrix het = J(1, 8, .)

		foreach outcome in NLU DAR OI AI aB aNB {
			foreach v in experience dmale dwhite ///
						pBH pB pH pmB pmH ///
						crime vcrime ///
						bDAR bNLU bCOM pArr {

				* Load the moderator under test and rebuild the interaction
				quietly replace character = `v'
				quietly replace treatment_character = treatment * character

				quietly regress `outcome' ///
							treatment character treatment_character ///
							$covariates i.strata i.year_month ///
							if (months_1_4 == 1), ///
							vce(cluster employee_id)

				* Main effect of treatment
				quietly test treatment
				matrix P1 = (_b[treatment], _se[treatment], r(p), e(N))

				* Treatment x moderator interaction
				quietly test treatment_character
				matrix P2 = (_b[treatment_character], _se[treatment_character], r(p), e(N))

				* Store the row under its own name and append it to het
				matrix het_`v'_`outcome'_months_1_4_t = P1, P2
				matrix list het_`v'_`outcome'_months_1_4_t
				matrix het = het \ het_`v'_`outcome'_months_1_4_t
			}
		}
		
		* Descriptive statistics for the mediators, on the months 1-4 sample.
		* Each row holds: mean, sd, min, p25, p50, p75, p90, max, N.
		* NOTE(review): the variable named missing is not created in this
		* script; presumably it comes from the loaded data -- confirm.
		foreach m in experience bNLU bDAR pArr bCOM {
			quietly summarize `m' if months_1_4 == 1 & missing == 0, detail
			matrix d_`m' = (r(mean), r(sd), r(min), r(p25), r(p50), r(p75), r(p90), r(max), r(N))
		}

		* Stack the rows in reporting order and print the table
		matrix d = d_bNLU \ d_bDAR \ d_pArr \ d_bCOM \ d_experience
		matrix list d
		
* End of script ----------------------------------------------------------------